{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os, subprocess, random\n",
    "from pathlib import Path, PurePath\n",
    "from shutil import copyfile\n",
    "import pandas as pd\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Prepare the data for an experiment set up much like the one described in the paper: https://arxiv.org/pdf/1804.03209.pdf"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Paths"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Root data folder and the original (unmodified) data inside it.\n",
    "GOOGLE_DATA_PATH=Path('data/audio_google_original/')\n",
    "GOOGLE_ORIG_DATA=GOOGLE_DATA_PATH/'original/'\n",
    "# List files naming the test / validation wavs (one path per line, relative to GOOGLE_ORIG_DATA).\n",
    "TEST_LIST_FILE=GOOGLE_ORIG_DATA/'testing_list.txt'\n",
    "VALIDATION_LIST_FILE=GOOGLE_ORIG_DATA/'validation_list.txt'\n",
    "# Background-noise recordings and the folder that receives their 1-second segments.\n",
    "GOOGLE_DATA_BACKGROUND_PATH=GOOGLE_ORIG_DATA/'_background_noise_'\n",
    "GOOGLE_DATA_BACKGROUND_SPLITTED_PATH=GOOGLE_ORIG_DATA/'_background_noise_splitted_'\n",
    "GOOGLE_DATA_BACKGROUND_SPLITTED_PATH.mkdir(exist_ok=True)\n",
    "\n",
    "# Destination folders for the prepared test / train files.\n",
    "TEST_FOLDER=GOOGLE_DATA_PATH/'test'\n",
    "TRAIN_FOLDER=GOOGLE_DATA_PATH/'train'\n",
    "\n",
    "# Folders kept as their own label; files from any other folder are sampled into UNKNOW_CATEGORY.\n",
    "LABELS_USED=[\"yes\", \"no\", \"up\", \"down\", \"left\", \"right\", \"on\", \"off\", \"stop\", \"go\", '_background_noise_splitted_']\n",
    "UNKNOW_CATEGORY='unknown'\n",
    "NO_TALK_CATEGORY='_background_noise_'\n",
    "\n",
    "# Seed the global random generator so the random sampling of 'unknown' files is repeatable.\n",
    "SEED=42\n",
    "random.seed(SEED)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Split noise files into 1-second files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def split_wav(input_file, output_path):\n",
    "    \"\"\"Split one wav file into consecutive segments of about 1 second with ffmpeg.\n",
    "\n",
    "    Segments are written as `<output_path>/<input file stem>NNN.wav`, where NNN is\n",
    "    ffmpeg's zero-padded segment counter (`%03d`).\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    input_file : str or Path\n",
    "        Wav file to split.\n",
    "    output_path : Path\n",
    "        Folder that receives the segments.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    (stdout, stderr) : tuple of bytes\n",
    "        Raw output captured from the ffmpeg process.\n",
    "    \"\"\"\n",
    "    # Path.stem drops folders and extension using the separator of the running OS;\n",
    "    # splitting the string on a backslash only worked on Windows.\n",
    "    filename_wo_suffix=Path(input_file).stem\n",
    "    print(filename_wo_suffix)\n",
    "    output_file_begin=str(output_path/filename_wo_suffix)\n",
    "    print(output_file_begin)\n",
    "    process = subprocess.Popen(['ffmpeg', '-i',  str(input_file), '-f', 'segment', '-segment_time', '1',\n",
    "                                '-c', 'copy', f'{output_file_begin}%03d.wav'],\n",
    "                     stdout=subprocess.PIPE, \n",
    "                     stderr=subprocess.PIPE)\n",
    "    stdout, stderr = process.communicate()\n",
    "    # Callers may ignore the return value, so make a failed split visible.\n",
    "    if process.returncode!=0:\n",
    "        print(f'ffmpeg failed for {input_file} (exit code {process.returncode})')\n",
    "    return stdout, stderr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "background_files=list(GOOGLE_DATA_BACKGROUND_PATH.glob('*.wav'))\n",
    "len(background_files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "doing_the_dishes\n",
      "data\\audio_google_original\\original\\_background_noise_splitted_\\doing_the_dishes\n",
      "dude_miaowing\n",
      "data\\audio_google_original\\original\\_background_noise_splitted_\\dude_miaowing\n",
      "exercise_bike\n",
      "data\\audio_google_original\\original\\_background_noise_splitted_\\exercise_bike\n",
      "pink_noise\n",
      "data\\audio_google_original\\original\\_background_noise_splitted_\\pink_noise\n",
      "running_tap\n",
      "data\\audio_google_original\\original\\_background_noise_splitted_\\running_tap\n",
      "white_noise\n",
      "data\\audio_google_original\\original\\_background_noise_splitted_\\white_noise\n"
     ]
    }
   ],
   "source": [
    "for background_file in background_files:\n",
    "    split_wav(background_file, GOOGLE_DATA_BACKGROUND_SPLITTED_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "402"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# as_posix() gives forward-slash paths on every OS, which the later '/'-based splitting relies on.\n",
    "splitted_background_files=[fl.as_posix() for fl in GOOGLE_DATA_BACKGROUND_SPLITTED_PATH.rglob('*.wav')]\n",
    "len(splitted_background_files)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Get list of all files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "106237"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# as_posix() gives forward-slash paths on every OS; sorted() makes the order independent\n",
    "# of how the filesystem happens to enumerate the folders.\n",
    "all_files=sorted(fl.as_posix() for fl in GOOGLE_ORIG_DATA.rglob('*.wav'))\n",
    "\n",
    "len(all_files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['data/audio_google_original/original/backward/0165e0e8_nohash_0.wav',\n",
       " 'data/audio_google_original/original/backward/017c4098_nohash_0.wav',\n",
       " 'data/audio_google_original/original/backward/017c4098_nohash_1.wav',\n",
       " 'data/audio_google_original/original/backward/017c4098_nohash_2.wav',\n",
       " 'data/audio_google_original/original/backward/017c4098_nohash_3.wav']"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_files[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "106231"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "### exclude original noise files\n",
    "all_files=[fl for fl in all_files if '/_background_noise_/' not in fl]\n",
    "len(all_files)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Make train file list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'data/audio_google_original/original/testing_list.txt'"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "TEST_LIST_FILE.as_posix()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "11006"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# splitlines() plus the blank filter: a trailing newline must not become an empty entry.\n",
    "test_files=[line.strip() for line in TEST_LIST_FILE.read_text().splitlines() if line.strip()]\n",
    "len(test_files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['right/bb05582b_nohash_3.wav',\n",
       " 'right/97f4c236_nohash_2.wav',\n",
       " 'right/f2e59fea_nohash_3.wav',\n",
       " 'right/fdb5155e_nohash_2.wav',\n",
       " 'right/dc75148d_nohash_0.wav']"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_files[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9982"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# splitlines() plus the blank filter: a trailing newline must not become an empty entry.\n",
    "validation_files=[line.strip() for line in VALIDATION_LIST_FILE.read_text().splitlines() if line.strip()]\n",
    "len(validation_files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['right/a69b9b3e_nohash_0.wav',\n",
       " 'right/439c84f4_nohash_1.wav',\n",
       " 'right/409c962a_nohash_1.wav',\n",
       " 'right/dbaf8fc6_nohash_2.wav',\n",
       " 'right/a6d586b7_nohash_1.wav']"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "validation_files[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_train_list(all_files, test_files, validation_files, orig_files_folder='data/audio_google_original/original/'):\n",
    "    test_files_fullpath=[]\n",
    "    for file in test_files:\n",
    "        test_files_fullpath.append(orig_files_folder+file)\n",
    "        \n",
    "    validation_files_fullpath=[]\n",
    "    for file in validation_files:\n",
    "        validation_files_fullpath.append(orig_files_folder+file)\n",
    "        \n",
    "    train_files=set(all_files)-set(test_files_fullpath)-set(validation_files_fullpath)\n",
    "    \n",
    "    train_files=list(train_files)\n",
    "    return train_files, test_files_fullpath, validation_files_fullpath, "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(85245, 11006, 9982)"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_files, test_files, valid_files=make_train_list(all_files, test_files, validation_files)\n",
    "len(train_files), len(test_files), len(valid_files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[fl for fl in test_files if '_background_noise_' in fl]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Add background noise files to the test set (yes, I know they are also in the train set, but there aren't many of them, so let's use them)\n",
    "test_files=test_files+splitted_background_files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "11408"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(test_files)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Copy files to destination folders"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "95227"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# This paper https://arxiv.org/pdf/1804.03209.pdf says the validation set is used to adjust metrics during training, so I'll add it to the training set\n",
    "train_files_w_valid=train_files+valid_files\n",
    "len(train_files_w_valid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.2360278542137052"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "random.random()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "working on file 0\n",
      "working on file 1000\n",
      "working on file 2000\n",
      "working on file 3000\n",
      "working on file 4000\n",
      "working on file 5000\n",
      "working on file 6000\n",
      "working on file 7000\n",
      "working on file 8000\n",
      "working on file 9000\n",
      "working on file 10000\n",
      "working on file 11000\n"
     ]
    }
   ],
   "source": [
    "def copy_orig2model_folder(files, folder2copy, unk_catgory_p=0.06):\n",
    "    \"\"\"Copy wav files into one sub-folder per label under `folder2copy`.\n",
    "\n",
    "    The label is the name of the file's parent folder. Files whose folder is in\n",
    "    LABELS_USED keep their folder and filename. Any other file is kept only when\n",
    "    random.random() <= `unk_catgory_p`; it is then renamed to `<folder>_<filename>`\n",
    "    and placed in the UNKNOW_CATEGORY folder. Files that fail to copy are reported\n",
    "    and skipped.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    files : iterable of str\n",
    "        Source paths using '/' as separator.\n",
    "    folder2copy : Path\n",
    "        Destination root; created together with the label folders when missing.\n",
    "    unk_catgory_p : float\n",
    "        Probability of keeping a file whose folder is not in LABELS_USED.\n",
    "    \"\"\"\n",
    "    #don't want to have all rest in unk, 6% makes roughly equal categories\n",
    "    for i, fl in enumerate(files):\n",
    "        if i%1000==0:\n",
    "            print(f'working on file {i}')\n",
    "        folder, filename =fl.split('/')[-2:]\n",
    "        if folder not in LABELS_USED:\n",
    "            if random.random()>unk_catgory_p:\n",
    "                continue\n",
    "            filename=f'{folder}_{filename}'\n",
    "            folder=UNKNOW_CATEGORY\n",
    "        dest_folder=folder2copy/folder\n",
    "        # parents=True also creates folder2copy itself on a first run;\n",
    "        # exist_ok=True replaces the separate isdir check.\n",
    "        dest_folder.mkdir(parents=True, exist_ok=True)\n",
    "        dest_file=dest_folder/filename\n",
    "        try:\n",
    "            copyfile(fl, dest_file)\n",
    "        except Exception as e:\n",
    "            # Best effort: keep going, but say why the copy failed.\n",
    "            print(f'Exception occured at file {fl}: {e}')\n",
    "\n",
    "copy_orig2model_folder(test_files, TEST_FOLDER)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "working on file 0\n",
      "working on file 1000\n",
      "working on file 2000\n",
      "working on file 3000\n",
      "working on file 4000\n",
      "working on file 5000\n",
      "working on file 6000\n",
      "working on file 7000\n",
      "working on file 8000\n",
      "working on file 9000\n",
      "working on file 10000\n",
      "working on file 11000\n",
      "working on file 12000\n",
      "working on file 13000\n",
      "working on file 14000\n",
      "working on file 15000\n",
      "working on file 16000\n",
      "working on file 17000\n",
      "working on file 18000\n",
      "working on file 19000\n",
      "working on file 20000\n",
      "working on file 21000\n",
      "working on file 22000\n",
      "working on file 23000\n",
      "working on file 24000\n",
      "working on file 25000\n",
      "working on file 26000\n",
      "working on file 27000\n",
      "working on file 28000\n",
      "working on file 29000\n",
      "working on file 30000\n",
      "working on file 31000\n",
      "working on file 32000\n",
      "working on file 33000\n",
      "working on file 34000\n",
      "working on file 35000\n",
      "working on file 36000\n",
      "working on file 37000\n",
      "working on file 38000\n",
      "working on file 39000\n",
      "working on file 40000\n",
      "working on file 41000\n",
      "working on file 42000\n",
      "working on file 43000\n",
      "working on file 44000\n",
      "working on file 45000\n",
      "working on file 46000\n",
      "working on file 47000\n",
      "working on file 48000\n",
      "working on file 49000\n",
      "working on file 50000\n",
      "working on file 51000\n",
      "working on file 52000\n",
      "working on file 53000\n",
      "working on file 54000\n",
      "working on file 55000\n",
      "working on file 56000\n",
      "working on file 57000\n",
      "working on file 58000\n",
      "working on file 59000\n",
      "working on file 60000\n",
      "working on file 61000\n",
      "working on file 62000\n",
      "working on file 63000\n",
      "working on file 64000\n",
      "working on file 65000\n",
      "working on file 66000\n",
      "working on file 67000\n",
      "working on file 68000\n",
      "working on file 69000\n",
      "working on file 70000\n",
      "working on file 71000\n",
      "working on file 72000\n",
      "working on file 73000\n",
      "working on file 74000\n",
      "working on file 75000\n",
      "working on file 76000\n",
      "working on file 77000\n",
      "working on file 78000\n",
      "working on file 79000\n",
      "working on file 80000\n",
      "working on file 81000\n",
      "working on file 82000\n",
      "working on file 83000\n",
      "working on file 84000\n",
      "working on file 85000\n"
     ]
    }
   ],
   "source": [
    "# train_files_w_valid = train_files + valid_files; it was built for this copy, so use it here.\n",
    "copy_orig2model_folder(train_files_w_valid, TRAIN_FOLDER)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Overview of files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "34486"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_files_fullpath=list(TRAIN_FOLDER.rglob('*.wav'))\n",
    "len(train_files_fullpath)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4889"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_files_fullpath=list(TEST_FOLDER.rglob('*.wav'))\n",
    "len(test_files_fullpath)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_wav_info(wav_path):\n",
    "    \"\"\"Run `sox --i` on one wav file and return its raw (stdout, stderr) bytes.\"\"\"\n",
    "    completed = subprocess.run(['sox','--i', str(wav_path)],\n",
    "                               stdout=subprocess.PIPE,\n",
    "                               stderr=subprocess.PIPE)\n",
    "    return completed.stdout, completed.stderr\n",
    "\n",
    "def format_output(output_str):\n",
    "    info_dict={}\n",
    "    for row in output_str.split('\\n'):\n",
    "\n",
    "        pieces=row.split(' : ')\n",
    "        if len(pieces)==2:\n",
    "            info_dict[pieces[0].strip()]=pieces[1].strip()\n",
    "    return info_dict\n",
    "\n",
    "def get_wavs_info(wavs_path):\n",
    "    \"\"\"Collect the parsed `sox --i` report of every wav path into a DataFrame (one row per file).\"\"\"\n",
    "    rows=[format_output(get_wav_info(wav)[0].decode('utf-8')) for wav in wavs_path]\n",
    "    return pd.DataFrame(rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_test_info=get_wavs_info(test_files_fullpath)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4889, 7)"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_test_info.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train_info=get_wavs_info(train_files_fullpath)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(34486, 7)"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train_info.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Input File</th>\n",
       "      <th>Channels</th>\n",
       "      <th>Sample Rate</th>\n",
       "      <th>Precision</th>\n",
       "      <th>Duration</th>\n",
       "      <th>File Size</th>\n",
       "      <th>Bit Rate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4884</th>\n",
       "      <td>'data\\audio_google_original\\test\\_background_n...</td>\n",
       "      <td>1</td>\n",
       "      <td>16000</td>\n",
       "      <td>16-bit</td>\n",
       "      <td>00:00:01.02 = 16384 samples ~ 76.8 CDDA sectors</td>\n",
       "      <td>32.8k</td>\n",
       "      <td>257k</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4885</th>\n",
       "      <td>'data\\audio_google_original\\test\\_background_n...</td>\n",
       "      <td>1</td>\n",
       "      <td>16000</td>\n",
       "      <td>16-bit</td>\n",
       "      <td>00:00:01.02 = 16384 samples ~ 76.8 CDDA sectors</td>\n",
       "      <td>32.8k</td>\n",
       "      <td>257k</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4886</th>\n",
       "      <td>'data\\audio_google_original\\test\\_background_n...</td>\n",
       "      <td>1</td>\n",
       "      <td>16000</td>\n",
       "      <td>16-bit</td>\n",
       "      <td>00:00:01.02 = 16384 samples ~ 76.8 CDDA sectors</td>\n",
       "      <td>32.8k</td>\n",
       "      <td>257k</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4887</th>\n",
       "      <td>'data\\audio_google_original\\test\\_background_n...</td>\n",
       "      <td>1</td>\n",
       "      <td>16000</td>\n",
       "      <td>16-bit</td>\n",
       "      <td>00:00:00.90 = 14336 samples ~ 67.2 CDDA sectors</td>\n",
       "      <td>28.8k</td>\n",
       "      <td>257k</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4888</th>\n",
       "      <td>'data\\audio_google_original\\test\\_background_n...</td>\n",
       "      <td>1</td>\n",
       "      <td>16000</td>\n",
       "      <td>16-bit</td>\n",
       "      <td>00:00:00.99 = 15872 samples ~ 74.4 CDDA sectors</td>\n",
       "      <td>31.8k</td>\n",
       "      <td>257k</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                             Input File Channels Sample Rate  \\\n",
       "4884  'data\\audio_google_original\\test\\_background_n...        1       16000   \n",
       "4885  'data\\audio_google_original\\test\\_background_n...        1       16000   \n",
       "4886  'data\\audio_google_original\\test\\_background_n...        1       16000   \n",
       "4887  'data\\audio_google_original\\test\\_background_n...        1       16000   \n",
       "4888  'data\\audio_google_original\\test\\_background_n...        1       16000   \n",
       "\n",
       "     Precision                                         Duration File Size  \\\n",
       "4884    16-bit  00:00:01.02 = 16384 samples ~ 76.8 CDDA sectors     32.8k   \n",
       "4885    16-bit  00:00:01.02 = 16384 samples ~ 76.8 CDDA sectors     32.8k   \n",
       "4886    16-bit  00:00:01.02 = 16384 samples ~ 76.8 CDDA sectors     32.8k   \n",
       "4887    16-bit  00:00:00.90 = 14336 samples ~ 67.2 CDDA sectors     28.8k   \n",
       "4888    16-bit  00:00:00.99 = 15872 samples ~ 74.4 CDDA sectors     31.8k   \n",
       "\n",
       "     Bit Rate  \n",
       "4884     257k  \n",
       "4885     257k  \n",
       "4886     257k  \n",
       "4887     257k  \n",
       "4888     257k  "
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_test_info.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Input File</th>\n",
       "      <th>Channels</th>\n",
       "      <th>Sample Rate</th>\n",
       "      <th>Precision</th>\n",
       "      <th>Duration</th>\n",
       "      <th>File Size</th>\n",
       "      <th>Bit Rate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>4889</td>\n",
       "      <td>4889</td>\n",
       "      <td>4889</td>\n",
       "      <td>4889</td>\n",
       "      <td>4889</td>\n",
       "      <td>4889</td>\n",
       "      <td>4889</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>unique</th>\n",
       "      <td>4889</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>49</td>\n",
       "      <td>36</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>top</th>\n",
       "      <td>'data\\audio_google_original\\test\\right\\aa48c94...</td>\n",
       "      <td>1</td>\n",
       "      <td>16000</td>\n",
       "      <td>16-bit</td>\n",
       "      <td>00:00:01.00 = 16000 samples ~ 75 CDDA sectors</td>\n",
       "      <td>32.0k</td>\n",
       "      <td>256k</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>freq</th>\n",
       "      <td>1</td>\n",
       "      <td>4889</td>\n",
       "      <td>4889</td>\n",
       "      <td>4889</td>\n",
       "      <td>4149</td>\n",
       "      <td>4149</td>\n",
       "      <td>4405</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               Input File Channels  \\\n",
       "count                                                4889     4889   \n",
       "unique                                               4889        1   \n",
       "top     'data\\audio_google_original\\test\\right\\aa48c94...        1   \n",
       "freq                                                    1     4889   \n",
       "\n",
       "       Sample Rate Precision                                       Duration  \\\n",
       "count         4889      4889                                           4889   \n",
       "unique           1         1                                             49   \n",
       "top          16000    16-bit  00:00:01.00 = 16000 samples ~ 75 CDDA sectors   \n",
       "freq          4889      4889                                           4149   \n",
       "\n",
       "       File Size Bit Rate  \n",
       "count       4889     4889  \n",
       "unique        36        5  \n",
       "top        32.0k     256k  \n",
       "freq        4149     4405  "
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_test_info.describe(include='all')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Input File</th>\n",
       "      <th>Channels</th>\n",
       "      <th>Sample Rate</th>\n",
       "      <th>Precision</th>\n",
       "      <th>Duration</th>\n",
       "      <th>File Size</th>\n",
       "      <th>Bit Rate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>34486</td>\n",
       "      <td>34486</td>\n",
       "      <td>34486</td>\n",
       "      <td>34486</td>\n",
       "      <td>34486</td>\n",
       "      <td>34486</td>\n",
       "      <td>34486</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>unique</th>\n",
       "      <td>34486</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>117</td>\n",
       "      <td>79</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>top</th>\n",
       "      <td>'data\\audio_google_original\\train\\right\\e4b025...</td>\n",
       "      <td>1</td>\n",
       "      <td>16000</td>\n",
       "      <td>16-bit</td>\n",
       "      <td>00:00:01.00 = 16000 samples ~ 75 CDDA sectors</td>\n",
       "      <td>32.0k</td>\n",
       "      <td>256k</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>freq</th>\n",
       "      <td>1</td>\n",
       "      <td>34486</td>\n",
       "      <td>34486</td>\n",
       "      <td>34486</td>\n",
       "      <td>30864</td>\n",
       "      <td>30864</td>\n",
       "      <td>33417</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               Input File Channels  \\\n",
       "count                                               34486    34486   \n",
       "unique                                              34486        1   \n",
       "top     'data\\audio_google_original\\train\\right\\e4b025...        1   \n",
       "freq                                                    1    34486   \n",
       "\n",
       "       Sample Rate Precision                                       Duration  \\\n",
       "count        34486     34486                                          34486   \n",
       "unique           1         1                                            117   \n",
       "top          16000    16-bit  00:00:01.00 = 16000 samples ~ 75 CDDA sectors   \n",
       "freq         34486     34486                                          30864   \n",
       "\n",
       "       File Size Bit Rate  \n",
       "count      34486    34486  \n",
       "unique        79        5  \n",
       "top        32.0k     256k  \n",
       "freq       30864    33417  "
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train_info.describe(include='all')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Duration to time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "def dur2sec(dur_str):\n",
    "    #assume duration is in seconds doesnt look for hours, minutes\n",
    "    dur_str_pieces=dur_str.split(':')\n",
    "    return float(dur_str_pieces[-1])\n",
    "\n",
    "def get_dur(dur_str_raw):\n",
    "    \"\"\"Return the duration in seconds from a string like '00:00:01.00 = 16000 samples ...'.\"\"\"\n",
    "    # Only the part before the first ' = ' is the timestamp.\n",
    "    timestamp=dur_str_raw.split(' = ')[0]\n",
    "    return dur2sec(timestamp)\n",
    "\n",
    "def get_label(path):\n",
    "    return path.split('\\\\')[-2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_test_info['duration_sec']=df_test_info.Duration.apply(get_dur)\n",
    "df_train_info['duration_sec']=df_train_info.Duration.apply(get_dur)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_test_info['label']=df_test_info['Input File'].apply(get_label)\n",
    "df_train_info['label']=df_train_info['Input File'].apply(get_label)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.3382916666666667"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_test_info.duration_sec.sum()/60/60"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9.404272222222222"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train_info.duration_sec.sum()/60/60"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "label\n",
       "_background_noise_splitted_    6.639833\n",
       "down                           6.662667\n",
       "go                             6.548333\n",
       "left                           6.791833\n",
       "no                             6.692333\n",
       "off                            6.589833\n",
       "on                             6.490833\n",
       "right                          6.489500\n",
       "stop                           6.769500\n",
       "unknown                        6.756833\n",
       "up                             6.970333\n",
       "yes                            6.895667\n",
       "Name: duration_sec, dtype: float64"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_test_info.groupby('label')['duration_sec'].sum()/60"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "label\n",
       "_background_noise_splitted_     6.639833\n",
       "down                           51.348333\n",
       "go                             50.656500\n",
       "left                           49.797000\n",
       "no                             51.115667\n",
       "off                            48.744833\n",
       "on                             50.508167\n",
       "right                          49.430667\n",
       "stop                           51.022667\n",
       "unknown                        54.145167\n",
       "up                             47.919667\n",
       "yes                            52.927833\n",
       "Name: duration_sec, dtype: float64"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train_info.groupby('label')['duration_sec'].sum()/60"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
